Stock Market Prediction

Project Summary

Results

  • My selling rules need to be based on ML -- maybe something I run each day that predicts whether or not it will hit my goal considering I have one more day of data. each day something will be run to predict probability of still hitting target. if it becomes less than 50%, sell.

Setup


In [211]:
cat('Last run: ', "\n", format(Sys.time(), "%a %b %d %X %Y"))


Last run:  
 Thu Sep 14 1:29:27 PM 2017

In [ ]:
pacman::p_load(feather, tidyverse, tabplot, titanic, devtools, caret, gmodels, lubridate, 
               ggjoy, Amelia, ggvis, rattle, corrplot, caretEnsemble, GGally, quantmod, TTR, sqldf, tidyquant,
               PerformanceAnalytics, dygraphs, magrittr
               , install = TRUE, update = getOption("pac_update"), character.only = FALSE)
pacman::p_load_gh("clauswilke/ggjoy","rstats-db/odbc","rstudio/rmarkdown",'IRkernel/IRkernel' ) #"business-science/tidyquant"

IRkernel::installspec()  # to register the kernel in the current R installation
options(warn=-1)

In [213]:
stdf  <- tidyquant::tq_get("QQQ", get = "stock.prices", from = "2011-01-01") #GLD JJC UGA JJN SLV SPXL

In [214]:
head(stdf)
str(stdf)
tail(stdf)


dateopenhighlowclosevolumeadjusted
2011-01-0359.222 59.998 59.201 55.31 90994300 51.33858
2011-01-0459.804 59.847 59.168 55.27 43851500 51.30145
2011-01-0559.362 60.073 59.330 55.74 43787400 51.73770
2011-01-0660.117 60.289 59.987 55.92 46139600 51.90478
2011-01-0760.354 60.386 59.599 55.87 72261700 51.85837
2011-01-1060.063 60.526 59.880 56.08 55551800 52.05329
Classes 'tbl_df', 'tbl' and 'data.frame':	1686 obs. of  7 variables:
 $ date    : Date, format: "2011-01-03" "2011-01-04" ...
 $ open    : num  59.2 59.8 59.4 60.1 60.4 ...
 $ high    : num  60 59.8 60.1 60.3 60.4 ...
 $ low     : num  59.2 59.2 59.3 60 59.6 ...
 $ close   : num  55.3 55.3 55.7 55.9 55.9 ...
 $ volume  : num  90994300 43851500 43787400 46139600 72261700 ...
 $ adjusted: num  51.3 51.3 51.7 51.9 51.9 ...
dateopenhighlowclosevolumeadjusted
2017-09-06145.06 145.46 144.08 145.13 30722700 145.13
2017-09-07145.40 145.84 144.94 145.47 22721100 145.47
2017-09-08145.28 145.35 144.06 144.21 27815800 144.21
2017-09-11145.30 146.12 145.30 145.87 27126300 145.87
2017-09-12146.25 146.41 145.53 146.22 30256700 146.22
2017-09-13145.97 146.44 145.73 146.42 28200400 146.42

In [215]:
# Create my target variable
leadn <- 16

stdf <- mutate(stdf,
    #rolling window max close price for last n days
    runMaxHigh = runMax(stdf[,c("high")], n = leadn, cumulative = F),
    #what will be the high in the next 15 days?
    Max15 = lead(runMaxHigh, n=leadn, order_by = date),
    # running min low over last n days
    runMinLow = runMin(stdf[,c("low")], n = leadn, cumulative = F),
    #what will be the low in the next 15 days?
    Min15 = lead(runMinLow, n=leadn, order_by = date),
    #find future prices and their percent gains
    lead15 = lead(high, n=leadn, order_by = date),
    # Percent gain or loss #(Price Sold - Purchase Price)/(Purchase Price)
    percentchange15 = round((100 * (lead15 - high)/(high)),2),
    # create open price tomorrow, because that's what I'll be buying at           
    openpricetomorrow = lead(open, n=1, order_by = date),
    #if i buy today at tomorrow's open, will the price jump n% where I can sell it in the next 15 days
    #max15 as a percentage gain from Close price 15 days ago
    percentchangeMax15 = round((100 * (Max15 - openpricetomorrow)/(openpricetomorrow)),2),       
    # create target
    label_twopercent = case_when( 
        percentchangeMax15 < 2 ~ "class0",
        percentchangeMax15 >= 2 ~ "class1"
        )
    


    ) #end mutate

#convert target to factor
stdf$label_twopercent %<>% factor

# for class probabilities, have to change Class values with prefix "class". this prevents error
# and sets up for final results like i need them for the competition
#stdf$label_twopercent <- sub("^", "class", stdf$label_twopercent)

glimpse(stdf)


Observations: 1,686
Variables: 16
$ date               <date> 2011-01-03, 2011-01-04, 2011-01-05, 2011-01-06,...
$ open               <dbl> 59.222, 59.804, 59.362, 60.117, 60.354, 60.063, ...
$ high               <dbl> 59.998, 59.847, 60.073, 60.289, 60.386, 60.526, ...
$ low                <dbl> 59.201, 59.168, 59.330, 59.987, 59.599, 59.880, ...
$ close              <dbl> 55.31, 55.27, 55.74, 55.92, 55.87, 56.08, 56.16,...
$ volume             <dbl> 90994300, 43851500, 43787400, 46139600, 72261700...
$ adjusted           <dbl> 51.33858, 51.30145, 51.73770, 51.90478, 51.85837...
$ runMaxHigh         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ Max15              <dbl> 61.689, 61.786, 61.786, 61.786, 61.786, 61.786, ...
$ runMinLow          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ Min15              <dbl> 59.168, 59.330, 59.599, 59.599, 59.675, 59.675, ...
$ lead15             <dbl> 61.388, 61.786, 61.646, 60.504, 61.603, 61.593, ...
$ percentchange15    <dbl> 2.32, 3.24, 2.62, 0.36, 2.02, 1.76, 1.38, 1.52, ...
$ openpricetomorrow  <dbl> 59.804, 59.362, 60.117, 60.354, 60.063, 60.666, ...
$ percentchangeMax15 <dbl> 3.15, 4.08, 2.78, 2.37, 2.87, 1.85, 1.63, 1.45, ...
$ label_twopercent   <fctr> class1, class1, class1, class1, class1, class0,...

In [216]:
# create subset of all columns except leading. i don't want them leaking future data
stdf <- dplyr::select(stdf, -dplyr::one_of(
    c("percentchangeMax15","openpricetomorrow","lead15","Max15","Min15","percentchange15")
))

glimpse(stdf)


Observations: 1,686
Variables: 10
$ date             <date> 2011-01-03, 2011-01-04, 2011-01-05, 2011-01-06, 2...
$ open             <dbl> 59.222, 59.804, 59.362, 60.117, 60.354, 60.063, 60...
$ high             <dbl> 59.998, 59.847, 60.073, 60.289, 60.386, 60.526, 60...
$ low              <dbl> 59.201, 59.168, 59.330, 59.987, 59.599, 59.880, 60...
$ close            <dbl> 55.31, 55.27, 55.74, 55.92, 55.87, 56.08, 56.16, 5...
$ volume           <dbl> 90994300, 43851500, 43787400, 46139600, 72261700, ...
$ adjusted         <dbl> 51.33858, 51.30145, 51.73770, 51.90478, 51.85837, ...
$ runMaxHigh       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ runMinLow        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ label_twopercent <fctr> class1, class1, class1, class1, class1, class0, c...

In [217]:
#stdf$label_twopercent <- ifelse(stdf$percentchangeMax15 >= 2.0, 1,0) #will the price jump up 6% sometime withing the next 3 weeks

#https://www.rdocumentation.org/packages/dplyr/versions/0.7.2/topics/case_when

#stdf$label_twopercent <- case_when(
    #stdf$percentchangeMax15 < -2 ~ -2,
     #  stdf$percentchangeMax15 < 0 & stdf$percentchangeMax15 >= -2 ~ -1,
      #     stdf$percentchangeMax15 >=0 & stdf$percentchangeMax15 < 1 ~ 0,
      #stdf$percentchangeMax15 >= 1 & stdf$percentchangeMax15 < 2 ~ 1,
      #stdf$percentchangeMax15 >= 2 & stdf$percentchangeMax15 < 3 ~ 2,
      #stdf$percentchangeMax15 >= 3 & stdf$percentchangeMax15 < 4 ~ 3,
       # stdf$percentchangeMax15 >= 4 & stdf$percentchangeMax15 < 5 ~ 4,
      # stdf$percentchangeMax15 >= 5 & stdf$percentchangeMax15 < 6 ~ 5,
    # stdf$percentchangeMax15 >= 6 & stdf$percentchangeMax15 < 7 ~ 6,
    # stdf$percentchangeMax15 >= 7 & stdf$percentchangeMax15 < 8 ~ 7,
    # stdf$percentchangeMax15 >= 8 ~ 8
    
    #stdf$percentchangeMax15 < 2 ~ 0,
    #stdf$percentchangeMax15 >= 2 ~ 1
    #)

#stdf$label_twopercent <- as.factor(stdf$label_twopercent)

In [218]:
# target is na for the last n observations. i'll separate those after creating indicators and at the very end predict on them

In [219]:
#create predictors

stdf <- mutate(stdf,
    #create date features
    dayofweek = wday(date),
    dayofmonth = mday(date),
    dayofyear = yday(date),
    weekofyear = week(date),
    monthofyear = month(date),
    year = year(date),
               
    #EMA's
    EMA200 = TTR::EMA(high, n=200, ),
    EMA100 = TTR::EMA(high, n=100, ),
    EMA50 = TTR::EMA(high, n=50, ),
    EMA20 = TTR::EMA(high, n=20, ),
    EMA10 = TTR::EMA(high, n=10, ),
    EMA5 = TTR::EMA(high, n=5, ),
               
    # price over moving avg?
    PoM200 = ifelse(high >= EMA200, 1, 0),
    PoM100 = ifelse(high >= EMA100, 1, 0),
    PoM50 = ifelse(high >= EMA50, 1, 0),
    PoM20 = ifelse(high >= EMA20, 1, 0),
    PoM10 = ifelse(high >= EMA10, 1, 0),
    PoM5 = ifelse(high >= EMA5, 1, 0),

               # moving avg over moving avg?
    #if 100 over 200
    MoM100200 = ifelse(EMA100 >= EMA200, 1, 0),
    #if 50 over 200
    MoM50200 = ifelse(EMA50 >= EMA200, 1, 0),
    #if 20 over 200
    MoM20200 = ifelse(EMA20 >= EMA200, 1, 0),
    #if 20 over 50
    MoM50 = ifelse(EMA20 >= EMA50, 1, 0),
    #if 10 over 20
    MoM20 = ifelse(EMA10 >= EMA20, 1, 0),
    #if 5 over 10
    MoM10 = ifelse(EMA5 >= EMA10, 1, 0),
    #if 5 over 50
    MoM550 = ifelse(EMA5 >= EMA50, 1, 0),
    #if 5 over 20
    MoM520 = ifelse(EMA5 >= EMA20, 1, 0),
               
    #slope EMA 200
    lagEMA200 = lag(EMA200, n=1, order_by = date), #only used to calc slope
    EMA200Slope = (EMA200 - lagEMA200), # slope number
    EMA200SlopePN = ifelse(EMA200Slope > 0, 1,0), #says whether slope is positive or negative
    lagEMA200SlopePN = lag(EMA200Slope, n=1, order_by = date), #used just for slope change pos to neg
    EMA200SlopePNchange = ifelse(EMA200SlopePN > lagEMA200SlopePN, 1,0), #says whether slope changed from negative to positive. 1 if slope just changed positive

    #slope EMA 50
    lagEMA50 = lag(EMA50, n=1, order_by = date), #only used to calc slope
    EMA50Slope = (EMA50 - lagEMA50), # slope number
    EMA50SlopePN = ifelse(EMA50Slope > 0, 1,0), #says whether slope is positive or negative
    lagEMA50SlopePN = lag(EMA50Slope, n=1, order_by = date), #used just for slope change pos to neg
    EMA50SlopePNchange = ifelse(EMA50SlopePN > lagEMA50SlopePN, 1,0), #says whether slope changed from negative to positive. 1 if slope just changed positive


    #slope EMA 20
    lagEMA20 = lag(EMA20, n=1, order_by = date), #only used to calc slope
    EMA20Slope = (EMA20 - lagEMA20), # slope number
    EMA20SlopePN = ifelse(EMA20Slope > 0, 1,0), #says whether slope is positive or negative
    lagEMA20SlopePN = lag(EMA20Slope, n=1, order_by = date), #used just for slope change pos to neg
    EMA20SlopePNchange = ifelse(EMA20SlopePN > lagEMA20SlopePN, 1,0), #says whether slope changed from negative to positive. 1 if slope just changed positive

    #slope EMA 10
    lagEMA10 = lag(EMA10, n=1, order_by = date), #only used to calc slope
    EMA10Slope = (EMA10 - lagEMA10), # slope number
    EMA10SlopePN = ifelse(EMA10Slope > 0, 1,0), #says whether slope is positive or negative
    lagEMA10SlopePN = lag(EMA10Slope, n=1, order_by = date), #used just for slope change pos to neg
    EMA10SlopePNchange = ifelse(EMA10SlopePN > lagEMA10SlopePN, 1,0), #says whether slope changed from negative to positive. 1 if slope just changed positive

    # new high in past n days
               # if newhighndays is 1, then stock has reached a new high in last n days. if 0 it hasn't.
    #rolling window max high price for last n days
    runMaxHighndays = runMax(high, n = 15, cumulative = F),
    #if today's high is greater than yesterday's running max high, it's a new high
    newhighYN = ifelse(high > lag(runMaxHighndays, n=1, order_by = date), 1,0),
    #if sum running max is > 0, it's reached a new high in the last n days
    newhighndays = runMax(newhighYN, n = 15, cumulative = F), 

    # new low in past n days
    #rolling window max low price for last n days
    runMinLowndays = runMin(low, n = 15, cumulative = F),
    #if today's high is greater than yesterday's running max high, it's a new high
    newlowYN = ifelse(low < lag(runMinLowndays, n=1, order_by = date), 1,0),
    #if sum running max is > 0, it's reached a new high in the last n days
    newlowndays = runMin(newlowYN, n = 15, cumulative = F),
    
     #Rate of Change (ROC)
    roc = ROC(high, n=10, type = c("continuous"), na.pad = TRUE),
    #RSI
    rsi = RSI(high, n=14),
    #On-balance volume (OBV)
    obv = OBV(high, volume),
               
    # volatility
    volatility = volatility(high, n = 10, calc = "close", N = 260, mean0 = FALSE)
               
               )

glimpse(stdf)

tail(stdf$newhighndays)
tail(stdf$newlowndays)


Observations: 1,686
Variables: 66
$ date                <date> 2011-01-03, 2011-01-04, 2011-01-05, 2011-01-06...
$ open                <dbl> 59.222, 59.804, 59.362, 60.117, 60.354, 60.063,...
$ high                <dbl> 59.998, 59.847, 60.073, 60.289, 60.386, 60.526,...
$ low                 <dbl> 59.201, 59.168, 59.330, 59.987, 59.599, 59.880,...
$ close               <dbl> 55.31, 55.27, 55.74, 55.92, 55.87, 56.08, 56.16...
$ volume              <dbl> 90994300, 43851500, 43787400, 46139600, 7226170...
$ adjusted            <dbl> 51.33858, 51.30145, 51.73770, 51.90478, 51.8583...
$ runMaxHigh          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ runMinLow           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ label_twopercent    <fctr> class1, class1, class1, class1, class1, class0...
$ dayofweek           <dbl> 2, 3, 4, 5, 6, 2, 3, 4, 5, 6, 3, 4, 5, 6, 2, 3,...
$ dayofmonth          <int> 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 20, ...
$ dayofyear           <dbl> 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 20, ...
$ weekofyear          <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4,...
$ monthofyear         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ year                <dbl> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,...
$ EMA200              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA100              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA50               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA20               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA10               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 60.53230, 6...
$ EMA5                <dbl> NA, NA, NA, NA, 60.11860, 60.25440, 60.40960, 6...
$ PoM200              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ PoM100              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ PoM50               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ PoM20               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ PoM10               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, 1, 0,...
$ PoM5                <dbl> NA, NA, NA, NA, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0...
$ MoM100200           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ MoM50200            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ MoM20200            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ MoM50               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ MoM20               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ MoM10               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, 1, 1,...
$ MoM550              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ MoM520              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ lagEMA200           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA200Slope         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA200SlopePN       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ lagEMA200SlopePN    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA200SlopePNchange <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ lagEMA50            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA50Slope          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA50SlopePN        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ lagEMA50SlopePN     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA50SlopePNchange  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ lagEMA20            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA20Slope          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA20SlopePN        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ lagEMA20SlopePN     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA20SlopePNchange  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ lagEMA10            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 60.5323...
$ EMA10Slope          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.20449...
$ EMA10SlopePN        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, 0...
$ lagEMA10SlopePN     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.2...
$ EMA10SlopePNchange  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, ...
$ runMaxHighndays     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ newhighYN           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ newhighndays        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ runMinLowndays      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ newlowYN            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ newlowndays         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ roc                 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.02727...
$ rsi                 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ obv                 <dbl> 90994300, 47142800, 90930200, 137069800, 209331...
$ volatility          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.03481905,...
  1. 1
  2. 1
  3. 1
  4. 1
  5. 1
  6. 1
  1. 0
  2. 0
  3. 0
  4. 0
  5. 0
  6. 0

In [220]:
# I can leave the date features numeric but need to create char versions with dummies also


#create char versions
#stdf$c_dayofweek = as.character(stdf$dayofweek)
#stdf$c_dayofmonth = as.character(stdf$dayofmonth)
#stdf$c_dayofyear = as.character(stdf$dayofyear)
#stdf$c_weekofyear = as.character(stdf$weekofyear)
#stdf$c_monthofyear = as.character(stdf$monthofyear)
#stdf$c_year = as.character(stdf$year)



#create dummy variables 
#dmy <- caret::dummyVars(" ~ c_dayofweek + c_dayofmonth + c_weekofyear + c_monthofyear + c_year + c_dayofyear", data = stdf, fullRank = T)
#dataset_d <- data.frame(predict(dmy, newdata = stdf))

# recombine data
#stdf <- cbind(stdf, dataset_d)

#dim(stdf)
#glimpse(stdf)

In [221]:
#these indicators create multiple columns

#MACD. create predictors macd and signal
macd <- TTR::MACD(stdf$high, 12, 26, 9, maType="EMA" )
stdf <- cbind(stdf, macd)
stdf <- mutate(stdf,
    macddir = ifelse(macd > signal, 1, 0)
)

#Aroon
aroon <- aroon( stdf[,c("high", "low")], n=20 )
stdf <- cbind(stdf, aroon)
stdf <- mutate(stdf,
    aroondir = ifelse(aroonUp > aroonDn, 1, 0)
)

glimpse(stdf)


Observations: 1,686
Variables: 73
$ date                <date> 2011-01-03, 2011-01-04, 2011-01-05, 2011-01-06...
$ open                <dbl> 59.222, 59.804, 59.362, 60.117, 60.354, 60.063,...
$ high                <dbl> 59.998, 59.847, 60.073, 60.289, 60.386, 60.526,...
$ low                 <dbl> 59.201, 59.168, 59.330, 59.987, 59.599, 59.880,...
$ close               <dbl> 55.31, 55.27, 55.74, 55.92, 55.87, 56.08, 56.16...
$ volume              <dbl> 90994300, 43851500, 43787400, 46139600, 7226170...
$ adjusted            <dbl> 51.33858, 51.30145, 51.73770, 51.90478, 51.8583...
$ runMaxHigh          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ runMinLow           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ label_twopercent    <fctr> class1, class1, class1, class1, class1, class0...
$ dayofweek           <dbl> 2, 3, 4, 5, 6, 2, 3, 4, 5, 6, 3, 4, 5, 6, 2, 3,...
$ dayofmonth          <int> 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 20, ...
$ dayofyear           <dbl> 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 20, ...
$ weekofyear          <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4,...
$ monthofyear         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ year                <dbl> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,...
$ EMA200              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA100              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA50               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA20               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA10               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 60.53230, 6...
$ EMA5                <dbl> NA, NA, NA, NA, 60.11860, 60.25440, 60.40960, 6...
$ PoM200              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ PoM100              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ PoM50               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ PoM20               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ PoM10               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, 1, 0,...
$ PoM5                <dbl> NA, NA, NA, NA, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0...
$ MoM100200           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ MoM50200            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ MoM20200            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ MoM50               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ MoM20               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ MoM10               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, 1, 1,...
$ MoM550              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ MoM520              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ lagEMA200           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA200Slope         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA200SlopePN       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ lagEMA200SlopePN    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA200SlopePNchange <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ lagEMA50            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA50Slope          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA50SlopePN        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ lagEMA50SlopePN     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA50SlopePNchange  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ lagEMA20            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA20Slope          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA20SlopePN        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ lagEMA20SlopePN     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ EMA20SlopePNchange  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ lagEMA10            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 60.5323...
$ EMA10Slope          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.20449...
$ EMA10SlopePN        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, 0...
$ lagEMA10SlopePN     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.2...
$ EMA10SlopePNchange  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, ...
$ runMaxHighndays     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ newhighYN           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ newhighndays        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ runMinLowndays      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ newlowYN            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ newlowndays         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ roc                 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.02727...
$ rsi                 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ obv                 <dbl> 90994300, 47142800, 90930200, 137069800, 209331...
$ volatility          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.03481905,...
$ macd                <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ signal              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ macddir             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ aroonUp             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ aroonDn             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ oscillator          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ aroondir            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...

In [222]:
#add rolling recent high feature
#add crossovers of price and ema's and ema's crossing themselfves
#crossover of macd and its signal

In [223]:
visdat::vis_miss(stdf, cluster = T, sort_miss = T)



In [224]:
# create a numeric dataset
n_dataset <- dplyr::select_if(stdf, is.numeric)


#psych::pairs.panels(dplyr::select(n_dataset, 1:10), pch = ".", hist.col = "darkgreen", ellipse = F, lm = T)
#psych::pairs.panels(dplyr::select(n_dataset, 11:20), pch = ".", hist.col = "darkgreen", ellipse = F, lm = T)

In [225]:
tail(n_dataset)


openhighlowclosevolumeadjustedrunMaxHighrunMinLowdayofweekdayofmonth...rsiobvvolatilitymacdsignalmacddiraroonUparoonDnoscillatoraroondir
1681145.06 145.46 144.08 145.13 30722700 145.13 146.59 140.18 4 6 ... 57.31598 457475600 0.089481200.4404883 0.2691076 1 90 45 45 1
1682145.40 145.84 144.94 145.47 22721100 145.47 146.59 140.18 5 7 ... 58.90521 480196700 0.089423440.4708558 0.3094573 1 85 40 45 1
1683145.28 145.35 144.06 144.21 27815800 144.21 146.59 140.18 6 8 ... 56.00934 452380900 0.093703420.4622141 0.3400086 1 80 35 45 1
1684145.30 146.12 145.30 145.87 27126300 145.87 146.59 140.18 2 11 ... 59.38811 479507200 0.090658300.4925035 0.3705076 1 75 30 45 1
1685146.25 146.41 145.53 146.22 30256700 146.22 146.59 140.18 3 12 ... 60.61504 509763900 0.090608190.5264444 0.4016950 1 70 25 45 1
1686145.97 146.44 145.73 146.42 28200400 146.42 146.59 141.06 4 13 ... 60.74715 537964300 0.071730030.5485590 0.4310678 1 65 20 45 1

In [226]:
PerformanceAnalytics::chart.Correlation(dplyr::select(n_dataset, dayofyear, obv), histogram=TRUE, pch=".")



In [227]:
# target percentages.
stdf %>%
  dplyr::group_by(label_twopercent) %>%
  dplyr::summarise(count = n() / nrow(.) )


label_twopercentcount
class0 0.407473310
class1 0.583036773
NA 0.009489917

If I guessed YES that the stock will hit a high of n% for each observation, I would be right class1 % of the time. I really care about guessing YES and the stock not hitting my target. So I will focus on negative predictive value.

So the really question I want answered is: When I predict a YES, how accurate am I?


In [228]:
#clip the first 50 records, since they have na's
#clip the last 15, since can't be used to train
dim(stdf)


  1. 1686
  2. 73

In [229]:
nd <- nrow(stdf) #number of rows
st <- nrow(stdf) - 15  # number of rows minus 15


st
nd

exc <- dplyr::slice(stdf, 1:st) # subset to train on all except last 15
exend <- dplyr::slice(stdf, st+1:nd) # subset of last 15 records to score at the end. +1 is so it doesn't include the last row of the exc dataset

dim(exc)
tail(exc)
tail(exend, n = 15)


1671
1686
  1. 1671
  2. 73
dateopenhighlowclosevolumeadjustedrunMaxHighrunMinLowlabel_twopercent...rsiobvvolatilitymacdsignalmacddiraroonUparoonDnoscillatoraroondir
2017-08-15144.29 144.30 143.64 144.03 25509200 144.03 145.96 140.89 class0 ... 54.74437 490447700 0.1094365 0.388781100.5401705 0 35 85 -50 0
2017-08-16144.39 144.96 143.80 144.28 31913000 144.28 145.96 140.89 class1 ... 57.58516 522360700 0.1111873 0.414652970.5150670 0 30 80 -50 0
2017-08-17143.72 143.94 141.33 141.33 70191000 141.33 145.96 140.89 class1 ... 52.13809 452169700 0.1184406 0.373670250.4867877 0 25 75 -50 0
2017-08-18141.41 142.20 140.65 141.23 60751900 141.23 145.47 140.65 class1 ... 44.41917 391417800 0.1315811 0.240853240.4376008 0 20 100 -80 0
2017-08-21141.22 141.53 140.18 141.05 37111500 141.05 145.47 140.18 class1 ... 41.84992 354306300 0.1225336 0.096684020.3694174 0 15 100 -85 0
2017-08-22141.81 143.35 141.76 143.20 36294200 143.20 145.47 140.18 NA ... 50.26535 390600500 0.1427631 0.083744370.3122828 0 10 95 -85 0
dateopenhighlowclosevolumeadjustedrunMaxHighrunMinLowlabel_twopercent...rsiobvvolatilitymacdsignalmacddiraroonUparoonDnoscillatoraroondir
2017-08-23 142.48 143.01 142.38 142.69 25702300 142.69 145.47 140.18 NA ... 48.84327 364898200 0.14028578 0.0537372910.2605737 0 5 90 -85 0
2017-08-24 143.01 143.17 141.47 142.27 38160800 142.27 145.47 140.18 NA ... 49.56637 403059000 0.13448212 0.0385143030.2161618 0 0 85 -85 0
2017-08-25 142.82 143.18 141.78 141.97 26380400 141.97 145.47 140.18 NA ... 49.61431 429439400 0.11502228 0.0267040240.1782703 0 35 80 -45 0
2017-08-28 142.40 142.74 141.95 142.41 15362100 142.41 145.47 140.18 NA ... 47.47605 414077300 0.11506869 -0.0073547050.1411453 0 30 75 -45 0
2017-08-29 141.19 143.21 141.06 142.97 24498300 142.97 145.47 140.18 NA ... 49.95705 438575600 0.11317755 -0.0077857920.1113591 0 25 70 -45 0
2017-08-30 143.09 144.89 143.00 144.65 33642700 144.65 144.96 140.18 NA ... 57.65639 472218300 0.12643469 0.0854560250.1061785 0 20 65 -45 0
2017-08-31 145.06 146.21 144.98 146.20 38480700 146.20 146.21 140.18 NA ... 62.53390 510699000 0.10590157 0.2306414600.1310711 1 100 60 40 1
2017-09-01 146.39 146.59 145.61 146.00 22293600 146.00 146.59 140.18 NA ... 63.82573 532992600 0.09510997 0.3624101540.1773389 1 100 55 45 1
2017-09-05 145.57 145.88 143.60 144.69 44794300 144.69 146.59 140.18 NA ... 59.68488 488198300 0.08863772 0.4219567380.2262624 1 95 50 45 1
2017-09-06 145.06 145.46 144.08 145.13 30722700 145.13 146.59 140.18 NA ... 57.31598 457475600 0.08948120 0.4404883060.2691076 1 90 45 45 1
2017-09-07 145.40 145.84 144.94 145.47 22721100 145.47 146.59 140.18 NA ... 58.90521 480196700 0.08942344 0.4708557950.3094573 1 85 40 45 1
2017-09-08 145.28 145.35 144.06 144.21 27815800 144.21 146.59 140.18 NA ... 56.00934 452380900 0.09370342 0.4622141380.3400086 1 80 35 45 1
2017-09-11 145.30 146.12 145.30 145.87 27126300 145.87 146.59 140.18 NA ... 59.38811 479507200 0.09065830 0.4925035290.3705076 1 75 30 45 1
2017-09-12 146.25 146.41 145.53 146.22 30256700 146.22 146.59 140.18 NA ... 60.61504 509763900 0.09060819 0.5264443690.4016950 1 70 25 45 1
2017-09-13 145.97 146.44 145.73 146.42 28200400 146.42 146.59 141.06 NA ... 60.74715 537964300 0.07173003 0.5485590240.4310678 1 65 20 45 1

In [230]:
# get rid of any rows with na's in the training set
exc<-na.omit(exc)
dim(exc)
head(exc)


  1. 1469
  2. 73
dateopenhighlowclosevolumeadjustedrunMaxHighrunMinLowlabel_twopercent...rsiobvvolatilitymacdsignalmacddiraroonUparoonDnoscillatoraroondir
2011-10-19 61.790 61.983 60.730 56.87 67639400 53.07784 62.455 53.679 class1 ... 61.11970 -9697433000.1596754 1.381602 0.7064756 1 95 45 50 1
2011-10-20 60.922 60.987 59.776 56.59 80378300 52.81651 62.455 53.679 class1 ... 54.97191 -10501216000.2022296 1.294486 0.8240776 1 90 40 50 1
2011-10-21 61.276 61.780 60.837 57.30 60455800 53.47916 62.455 53.679 class1 ... 58.54704 -9896658000.1775139 1.315288 0.9223197 1 85 35 50 1
2011-10-24 61.619 62.840 61.405 58.49 61449500 54.58982 62.840 53.679 class0 ... 62.79892 -9282163000.1903178 1.454416 1.0287389 1 100 30 70 1
2011-10-25 62.465 62.487 61.308 57.34 65299200 53.51651 62.840 53.679 class1 ... 60.57078 -9935155000.1876989 1.499739 1.1229390 1 95 25 70 1
2011-10-26 61.780 61.908 60.226 57.27 98193800 53.45116 62.840 55.351 class0 ... 56.99853 -10917093000.1973241 1.442433 1.1868377 1 90 20 70 1

In [231]:
# Split out validation dataset
validation_index <- createDataPartition(exc$label_twopercent, p = 0.67, list = FALSE)
validation <- exc[-validation_index,]
sktrain <- exc[validation_index,]

In [ ]:


In [232]:
# set up k-fold cross validation and metric
control <- trainControl(method = "cv", number = 10, sampling = "up"
                        , summaryFunction=twoClassSummary
                        , classProbs = TRUE
                       )
metric <- "ROC"#"Accuracy"

In [233]:
formula <- label_twopercent ~ volume + rsi + PoM50 + dayofweek + dayofmonth + dayofyear + weekofyear + year + MoM20 +
    PoM20 + PoM10 + PoM5 + MoM50 + lagEMA20 + EMA20Slope + EMA20SlopePN + lagEMA20SlopePN + EMA20SlopePNchange + 
    macd + signal + roc + obv + aroonUp + aroonDn + oscillator + runMaxHigh + runMinLow + aroondir + macddir + 
    MoM10 + MoM550 + MoM520 + lagEMA10 + EMA10Slope + EMA10SlopePN + lagEMA10SlopePN + EMA10SlopePNchange + newhighYN +
    newhighndays + runMaxHighndays + runMinLowndays + newlowYN + newlowndays + runMaxHigh/runMinLow + dayofyear/dayofmonth +
    dayofyear/dayofweek + obv/dayofyear + obv/signal + dayofyear/signal + obv/roc + obv/EMA10Slope + EMA200 +
    EMA100 + PoM200 + PoM100 + MoM100200 + MoM50200 + MoM20200 + lagEMA200 + EMA200Slope + EMA200SlopePN +
    lagEMA200SlopePN + EMA200SlopePNchange + lagEMA50 + EMA50Slope + EMA50SlopePN + lagEMA50SlopePN + EMA50SlopePNchange +
    volatility + volatility/obv + volatility/dayofyear

In [234]:
# create formula and set for using all the dummies
#labely <- sktrain$label_twopercent
#sktrain <- dplyr::select_if(sktrain, is.numeric)
#sktrain <- cbind(labely, sktrain)
#formula <- labely ~ .

In [235]:
set.seed(13)
fit.c5 <- train(formula, data = sktrain, method = "C5.0", preProcess = c('zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
# GLMNET
set.seed(13)
#fit.glmnet <- train(formula, data = sktrain, method = "glmnet", preProcess = c('zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
# KNN
set.seed(13)
#fit.knn <- train(formula, data = sktrain, method = "knn", preProcess = c('zv', "center", "scale",'medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
# SVM
set.seed(13)
#fit.svm <- train(formula, data = sktrain, method = "svmRadial", preProcess = c('center','scale','zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
#xgb
set.seed(13); fit.xgb <- train(formula, data = sktrain, method = "xgbTree", preProcess =c('zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
#xgblinear
set.seed(13); fit.xgbLinear <- train(formula, data = sktrain, method = "xgbLinear", preProcess =c('zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)

In [236]:
fit.c5
#fit.svm


C5.0 

985 samples
 61 predictor
  2 classes: 'class0', 'class1' 

Pre-processing: median imputation (70), Box-Cox transformation (22), remove (1) 
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 886, 887, 886, 886, 887, 886, ... 
Addtional sampling using up-sampling prior to pre-processing

Resampling results across tuning parameters:

  model  winnow  trials  ROC        Sens       Spec     
  rules  FALSE    1      0.7656927  0.7325784  0.7385965
  rules  FALSE   10      0.8675123  0.7443089  0.8315789
  rules  FALSE   20      0.8804552  0.7420441  0.8543860
  rules   TRUE    1      0.7480449  0.7108014  0.7333333
  rules   TRUE   10      0.8565306  0.7133566  0.8333333
  rules   TRUE   20      0.8706711  0.7568525  0.8368421
  tree   FALSE    1      0.7851494  0.7253775  0.7473684
  tree   FALSE   10      0.8704098  0.7542393  0.8140351
  tree   FALSE   20      0.8874564  0.7782230  0.8438596
  tree    TRUE    1      0.8130305  0.7349013  0.7859649
  tree    TRUE   10      0.8724550  0.7905343  0.8105263
  tree    TRUE   20      0.8869093  0.7520325  0.8421053

ROC was used to select the optimal model using  the largest value.
The final values used for the model were trials = 20, model = tree and winnow
 = FALSE.

In [237]:
# Compare results
results <- resamples(list(
    C5 = fit.c5,
    #GLMNET = fit.glmnet,
    #KNN = fit.knn,
    xgb = fit.xgb,
    XGBLinear = fit.xgbLinear
    #SVM = fit.svm
))

summary(results)

# view plots of model results
bwplot(results)
dotplot(results)

# view variable importancs
#varImp(fit.glmnet)
#varImp(fit.svm)
#varImp(fit.xgb)

#plot(varImp(fit.svm), top = 20)
plot(varImp(fit.xgb), top = 20)


Call:
summary.resamples(object = results)

Models: C5, xgb, XGBLinear 
Number of resamples: 10 

ROC 
            Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
C5        0.7962  0.8816 0.8993 0.8875  0.9141 0.9332    0
xgb       0.8400  0.8820 0.9009 0.8954  0.9122 0.9449    0
XGBLinear 0.8467  0.8956 0.8998 0.8998  0.9199 0.9336    0

Sens 
            Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
C5        0.6429  0.7561 0.7712 0.7782  0.8243 0.8810    0
xgb       0.6829  0.7666 0.7831 0.7856  0.8243 0.8780    0
XGBLinear 0.7381  0.7666 0.7953 0.8073  0.8049 0.9524    0

Spec 
            Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
C5        0.7544  0.8421 0.8509 0.8439  0.8728 0.8772    0
xgb       0.7895  0.8114 0.8333 0.8421  0.8596 0.9474    0
XGBLinear 0.7895  0.8246 0.8333 0.8474  0.8728 0.9298    0

In [238]:
#test predicting power on unseen validation set
validation$prediction <- predict(fit.xgbLinear, newdata = validation, na.action = na.pass)

#Check the accuracy with a confusion matrix
confusionMatrix(validation$prediction, validation$label_twopercent, positive = "class0") #convention is positive class is the rarest one


Confusion Matrix and Statistics

          Reference
Prediction class0 class1
    class0    165     43
    class1     39    237
                                          
               Accuracy : 0.8306          
                 95% CI : (0.7941, 0.8629)
    No Information Rate : 0.5785          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.6535          
 Mcnemar's Test P-Value : 0.7404          
                                          
            Sensitivity : 0.8088          
            Specificity : 0.8464          
         Pos Pred Value : 0.7933          
         Neg Pred Value : 0.8587          
             Prevalence : 0.4215          
         Detection Rate : 0.3409          
   Detection Prevalence : 0.4298          
      Balanced Accuracy : 0.8276          
                                          
       'Positive' Class : class0          
                                          

In [239]:
validation


dateopenhighlowclosevolumeadjustedrunMaxHighrunMinLowlabel_twopercent...obvvolatilitymacdsignalmacddiraroonUparoonDnoscillatoraroondirprediction
2011-10-20 60.922 60.987 59.776 56.59 80378300 52.81651 62.455 53.679 class1 ... -10501216000.20222955 1.2944856 0.824077631 90 40 50 1 class1
2011-10-24 61.619 62.840 61.405 58.49 61449500 54.58982 62.840 53.679 class0 ... -9282163000.19031778 1.4544155 1.028738881 100 30 70 1 class1
2011-10-26 61.780 61.908 60.226 57.27 98193800 53.45116 62.840 55.351 class0 ... -10917093000.19732407 1.4424326 1.186837691 90 20 70 1 class1
2011-10-27 62.830 63.430 62.176 58.85 104455500 54.92582 63.430 57.012 class0 ... -9872538000.21831081 1.5778117 1.265032501 100 15 85 1 class1
2011-11-01 60.558 62.047 60.130 56.44 99933000 52.67652 63.430 59.679 class1 ... -11906570000.23044586 1.4779623 1.411430961 85 0 85 1 class1
2011-11-04 61.951 62.197 61.308 57.80 61527800 53.94582 63.430 59.776 class1 ... -12381944000.21264268 1.1260423 1.299645820 70 0 70 1 class0
2011-11-07 61.887 62.412 61.255 58.21 65559000 54.32848 63.430 59.776 class0 ... -11726354000.21201263 1.0840847 1.256533600 65 0 65 1 class0
2011-11-21 58.554 58.640 57.622 54.34 52283700 50.71654 63.194 57.622 class1 ... -14660130000.19798775 -0.2357549 0.496686380 15 100 -85 0 class1
2011-11-23 57.976 58.040 57.065 53.29 48437300 49.73656 63.194 57.065 class1 ... -15663904000.18915460 -0.8913937 0.049129040 5 100 -95 0 class1
2011-11-25 56.851 57.504 56.647 52.88 25448100 49.35390 63.194 56.647 class1 ... -15918385000.15933217 -1.2101357 -0.202723910 0 100 -100 0 class1
2011-12-09 60.280 61.276 60.258 57.02 55757900 53.21784 61.737 56.647 class0 ... -13825401000.13712130 0.0829594 -0.258868091 15 50 -35 0 class0
2011-12-12 60.451 61.083 59.894 56.38 55411700 52.62052 61.737 56.647 class0 ... -14379518000.14329760 0.1033070 -0.186433081 10 45 -35 0 class1
2011-12-19 58.831 58.970 57.869 54.32 46733900 50.84743 61.737 57.869 class1 ... -16146802000.12684994 -0.5281527 -0.256245380 50 20 30 1 class1
2011-12-21 59.397 59.440 58.211 55.13 78552300 51.60565 61.737 57.869 class1 ... -16114737000.16472813 -0.5770153 -0.364248180 40 10 30 1 class1
2011-12-29 59.431 59.804 59.239 55.99 28077900 52.45650 61.437 57.869 class1 ... -16700277000.11600124 -0.4202588 -0.441048941 15 65 -50 0 class1
2012-01-03 60.743 61.042 60.573 56.90 39514100 53.30908 61.276 57.869 class1 ... -16086812000.13864116 -0.2185220 -0.389136351 5 55 -50 0 class1
2012-01-09 61.864 61.886 61.299 57.62 39195500 53.98363 61.886 57.869 class1 ... -14524965000.11065166 0.4170717 -0.026090481 100 35 65 1 class1
2012-01-12 62.216 62.409 61.757 58.39 26188300 54.70504 62.409 58.211 class1 ... -14275024000.09963956 0.7834547 0.327063171 100 20 80 1 class1
2012-01-13 61.992 62.120 61.565 58.18 35980400 54.50829 62.409 58.211 class1 ... -14634828000.11102668 0.8140852 0.424467571 95 15 80 1 class1
2012-01-18 62.782 63.497 62.633 59.49 48692700 55.73563 63.497 59.174 class1 ... -13729993000.09011154 1.0887558 0.638793901 100 5 95 1 class1
2012-01-23 63.807 64.266 63.444 59.79 40958300 56.01669 64.266 59.239 class1 ... -13253082000.09819634 1.4752341 1.001281041 100 0 100 1 class1
2012-01-25 64.319 64.661 63.860 60.43 61591800 56.61630 64.661 60.370 class1 ... -13021566000.11729674 1.5975564 1.199965791 100 10 90 1 class1
2012-01-30 63.924 64.618 63.679 60.45 42797800 56.63504 64.896 61.266 class1 ... -12670384000.10624210 1.6881151 1.439195181 90 0 90 1 class1
2012-02-07 66.123 66.443 65.824 62.13 37935700 58.20902 66.443 62.430 class1 ... -10772203000.07573786 1.9676830 1.757968861 100 0 100 1 class1
2012-02-08 66.326 66.667 66.080 62.46 41171100 58.51819 66.667 62.633 class1 ... -10360492000.07568267 2.0033845 1.807052001 100 0 100 1 class1
2012-02-10 66.667 66.859 66.443 62.47 58090500 58.52756 67.222 63.412 class1 ... -10472833000.08112835 2.0621742 1.900954861 95 5 90 1 class1
2012-02-22 67.767 67.970 67.489 63.32 43174700 59.32391 68.194 64.191 class1 ... -10179038000.08971698 2.0611228 2.072611930 95 0 95 1 class1
2012-02-27 67.895 68.589 67.596 64.05 42678500 60.00784 68.589 65.813 class1 ... -8796104000.06373729 1.9373445 2.019309940 100 5 95 1 class1
2012-02-28 68.428 69.058 68.343 64.70 43717500 60.61682 69.058 65.813 class1 ... -8358929000.06863468 1.9481082 2.005069590 100 0 100 1 class1
2012-03-05 69.197 69.282 68.300 64.20 45788800 60.14837 69.464 66.443 class1 ... -7209802000.05237641 1.8873478 1.961873660 95 5 90 1 class1
.............................. ..............................
2017-03-02 132.042 132.062 131.248 130.78 19951400 130.1540 132.313 126.485 class0 ... 25131000 0.06153153 1.4189154 1.401896091 95 5 90 1 class0
2017-03-08 131.218 131.781 131.088 130.74 15776000 130.1142 132.313 128.354 class0 ... 31204600 0.06070134 1.2062941 1.325282240 75 5 70 1 class0
2017-03-09 131.339 131.650 130.766 130.84 20855300 130.2137 132.313 128.987 class0 ... 10349300 0.05716685 1.1435400 1.288933800 70 0 70 1 class0
2017-03-23 130.876 131.237 130.495 130.36 20311700 130.0058 133.047 130.033 class1 ... 49082000 0.08785661 0.6608826 0.915978240 70 5 65 1 class1
2017-04-07 132.360 132.731 131.858 131.97 16743100 131.6114 133.844 129.753 class1 ... 133679100 0.07565833 0.5079876 0.577473670 90 55 35 1 class1
2017-04-10 132.460 132.931 132.019 132.02 15619300 131.6613 133.844 129.753 class1 ... 149298400 0.06601224 0.4825250 0.558483930 85 50 35 1 class1
2017-04-11 132.259 132.460 130.856 131.45 33778400 131.0929 133.844 129.753 class1 ... 115520000 0.06843984 0.4287522 0.532537590 80 45 35 1 class1
2017-04-12 131.788 131.898 131.136 130.92 15955200 130.5643 133.844 129.753 class1 ... 99564800 0.06974607 0.3479304 0.495616150 75 40 35 1 class1
2017-04-17 131.126 131.858 131.076 131.48 13325500 131.1228 133.844 129.753 class1 ... 62556300 0.06861393 0.2208946 0.405985450 65 30 35 1 class1
2017-05-02 137.744 137.865 137.393 137.43 18345100 137.0566 137.865 130.735 class0 ... 242007000 0.06556021 1.0380507 0.696175681 100 40 60 1 class0
2017-05-03 137.494 137.544 136.992 136.99 23827600 136.6178 137.865 130.735 class1 ... 218179400 0.07452137 1.0838608 0.773712701 95 35 60 1 class1
2017-05-16 139.780 140.020 139.369 139.62 21786000 139.2407 140.020 134.846 class1 ... 308120600 0.03592924 1.2526168 1.159794081 100 0 100 1 class1
2017-05-19 138.125 138.797 138.035 137.84 36730100 137.4655 140.020 136.140 class1 ... 247888600 0.07787631 1.0371777 1.131027300 85 0 85 1 class1
2017-05-30 141.535 141.896 141.444 141.34 20306400 140.9560 141.896 136.240 class0 ... 334115200 0.09087732 1.1771986 1.108261851 100 65 35 1 class0
2017-06-07 143.650 143.931 142.988 143.42 18842400 143.0303 144.122 136.240 class0 ... 443966300 0.07466852 1.4138624 1.289609061 95 35 60 1 class0
2017-06-09 144.132 144.292 138.486 139.98 109783700 139.5997 144.292 136.240 class1 ... 579167400 0.05763833 1.4011624 1.331637631 100 25 75 1 class0
2017-06-12 139.178 139.860 137.845 139.23 104454000 138.8517 144.292 137.845 class0 ... 474713400 0.18796721 1.1278509 1.290880280 95 20 75 1 class1
2017-06-14 141.314 141.314 138.998 139.75 60093800 139.3703 144.292 137.845 class1 ... 586000200 0.19466103 0.8411710 1.148246730 85 10 75 1 class0
2017-06-15 138.717 139.659 137.865 139.13 55145200 138.7520 144.292 137.845 class1 ... 530855000 0.18798067 0.6441403 1.047425450 80 5 75 1 class0
2017-06-23 140.540 141.420 140.150 141.24 21654300 141.2400 144.292 137.640 class0 ... 562955800 0.11613826 0.2613293 0.491350910 50 75 -25 0 class0
2017-06-26 142.050 142.290 140.270 140.58 34411100 140.5800 144.292 137.640 class1 ... 597366900 0.11355353 0.3095085 0.454982430 45 70 -25 0 class0
2017-07-03 138.270 138.430 136.100 136.19 32797000 136.1900 142.290 136.100 class1 ... 283517200 0.09911522 -0.1723345 0.146031190 20 100 -80 0 class1
2017-07-05 136.620 137.900 136.160 137.53 42116600 137.5300 142.290 136.100 class1 ... 241400600 0.09677285 -0.2854596 0.059733030 15 95 -80 0 class1
2017-07-18 142.090 143.170 141.640 143.14 25748500 143.1400 143.170 135.800 class0 ... 439163000 0.07973609 0.3025613 -0.021498901 100 60 40 1 class0
2017-07-20 144.350 144.440 143.520 144.17 36139400 144.1700 144.440 135.800 class0 ... 509054700 0.05280637 0.6006741 0.180797121 100 50 50 1 class0
2017-07-27 145.820 145.960 142.300 143.96 80665900 143.9600 145.960 135.800 class0 ... 567225900 0.05241483 0.9703152 0.637897191 100 25 75 1 class0
2017-07-28 143.100 144.080 142.870 143.84 38013200 143.8400 145.960 136.740 class0 ... 529212700 0.09977670 0.9200750 0.694332761 95 20 75 1 class0
2017-08-09 143.120 144.190 142.790 144.12 36169500 144.1200 145.960 142.300 class1 ... 540110000 0.10647857 0.6439247 0.708308170 55 0 55 1 class0
2017-08-15 144.290 144.300 143.640 144.03 25509200 144.0300 145.960 140.890 class0 ... 490447700 0.10943652 0.3887811 0.540170520 35 85 -50 0 class1
2017-08-18 141.410 142.200 140.650 141.23 60751900 141.2300 145.470 140.650 class1 ... 391417800 0.13158114 0.2408532 0.437600780 20 100 -80 0 class1

In [240]:
# tune

xgbgrid = expand.grid(
  nrounds = c(2), # Test 4 values for boosting rounds
  max_depth = c(5, 10, 15), # Test 2 values for tree depth
  eta = c(0.1, 0.01, 0.001, 0.0001), # Test 3 values for learning rate
  gamma = c(0,1, 2, 3),
  colsample_bytree = c(0.4, 0.7, 1.0),
  min_child_weight = c(0.5, 1, 1.5),
  subsample = c(0.7)
)



# set up k-fold cross validation and metric
#control <- trainControl(
#    method = "cv"
#    , number = 10
#    , sampling = "up"
#)
#metric <- "Accuracy"

#xgb
#set.seed(13); fit.xgb <- train(formula, data = sktrain, method = "xgbTree", preProcess =c('medianImpute','BoxCox','zv'), metric = metric, 
#                               trControl = control, na.action = na.pass, tuneGrid=xgbgrid)

In [241]:
fit.xgb


eXtreme Gradient Boosting 

985 samples
 61 predictor
  2 classes: 'class0', 'class1' 

Pre-processing: median imputation (70), Box-Cox transformation (22), remove (1) 
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 886, 887, 886, 886, 887, 886, ... 
Addtional sampling using up-sampling prior to pre-processing

Resampling results across tuning parameters:

  eta  max_depth  colsample_bytree  subsample  nrounds  ROC        Sens     
  0.3  1          0.6               0.50        50      0.7472691  0.6799071
  0.3  1          0.6               0.50       100      0.7792770  0.7156794
  0.3  1          0.6               0.50       150      0.8023901  0.7374564
  0.3  1          0.6               0.75        50      0.7307756  0.6822880
  0.3  1          0.6               0.75       100      0.7714917  0.7015099
  0.3  1          0.6               0.75       150      0.7960908  0.7326365
  0.3  1          0.6               1.00        50      0.7332116  0.6821138
  0.3  1          0.6               1.00       100      0.7651028  0.6916957
  0.3  1          0.6               1.00       150      0.7848641  0.7037166
  0.3  1          0.8               0.50        50      0.7250418  0.6623113
  0.3  1          0.8               0.50       100      0.7758696  0.6916957
  0.3  1          0.8               0.50       150      0.7896255  0.7156794
  0.3  1          0.8               0.75        50      0.7461688  0.7131243
  0.3  1          0.8               0.75       100      0.7768838  0.7087108
  0.3  1          0.8               0.75       150      0.7961173  0.7302555
  0.3  1          0.8               1.00        50      0.7284925  0.6699768
  0.3  1          0.8               1.00       100      0.7716843  0.7111498
  0.3  1          0.8               1.00       150      0.7901639  0.7303717
  0.3  2          0.6               0.50        50      0.8328117  0.7374564
  0.3  2          0.6               0.50       100      0.8446930  0.7375145
  0.3  2          0.6               0.50       150      0.8608849  0.7374564
  0.3  2          0.6               0.75        50      0.8263790  0.7542393
  0.3  2          0.6               0.75       100      0.8561179  0.7760743
  0.3  2          0.6               0.75       150      0.8626026  0.7639954
  0.3  2          0.6               1.00        50      0.8426193  0.7759001
  0.3  2          0.6               1.00       100      0.8665256  0.7878049
  0.3  2          0.6               1.00       150      0.8732792  0.7877468
  0.3  2          0.8               0.50        50      0.8123276  0.7256678
  0.3  2          0.8               0.50       100      0.8397895  0.7567944
  0.3  2          0.8               0.50       150      0.8434817  0.7400697
  0.3  2          0.8               0.75        50      0.8338957  0.7593496
  0.3  2          0.8               0.75       100      0.8616592  0.7831591
  0.3  2          0.8               0.75       150      0.8701602  0.7857724
  0.3  2          0.8               1.00        50      0.8441434  0.7975610
  0.3  2          0.8               1.00       100      0.8611417  0.7808943
  0.3  2          0.8               1.00       150      0.8698138  0.7758420
  0.3  3          0.6               0.50        50      0.8565784  0.7686992
  0.3  3          0.6               0.50       100      0.8686859  0.7832172
  0.3  3          0.6               0.50       150      0.8765053  0.7951800
  0.3  3          0.6               0.75        50      0.8709049  0.7857143
  0.3  3          0.6               0.75       100      0.8813874  0.7930894
  0.3  3          0.6               0.75       150      0.8839742  0.7953542
  0.3  3          0.6               1.00        50      0.8709839  0.7782811
  0.3  3          0.6               1.00       100      0.8876307  0.7903600
  0.3  3          0.6               1.00       150      0.8897396  0.7857143
  0.3  3          0.8               0.50        50      0.8443405  0.7543554
  0.3  3          0.8               0.50       100      0.8742109  0.7711382
  0.3  3          0.8               0.50       150      0.8777747  0.7663763
  0.3  3          0.8               0.75        50      0.8776311  0.7878630
  0.3  3          0.8               0.75       100      0.8812305  0.7710801
  0.3  3          0.8               0.75       150      0.8874493  0.7808362
  0.3  3          0.8               1.00        50      0.8732196  0.7881533
  0.3  3          0.8               1.00       100      0.8906219  0.7956446
  0.3  3          0.8               1.00       150      0.8927685  0.7907085
  0.4  1          0.6               0.50        50      0.7698657  0.7063298
  0.4  1          0.6               0.50       100      0.7890397  0.7135308
  0.4  1          0.6               0.50       150      0.8096175  0.7235772
  0.4  1          0.6               0.75        50      0.7481549  0.6915215
  0.4  1          0.6               0.75       100      0.7942784  0.6990128
  0.4  1          0.6               0.75       150      0.8038704  0.7065041
  0.4  1          0.6               1.00        50      0.7529877  0.6891986
  0.4  1          0.6               1.00       100      0.7916412  0.7108595
  0.4  1          0.6               1.00       150      0.8067002  0.7229384
  0.4  1          0.8               0.50        50      0.7464143  0.6653891
  0.4  1          0.8               0.50       100      0.7786117  0.7160859
  0.4  1          0.8               0.50       150      0.8025226  0.7182927
  0.4  1          0.8               0.75        50      0.7484662  0.6939605
  0.4  1          0.8               0.75       100      0.7972548  0.7229965
  0.4  1          0.8               0.75       150      0.8067649  0.7228223
  0.4  1          0.8               1.00        50      0.7479288  0.7088269
  0.4  1          0.8               1.00       100      0.7907798  0.7205575
  0.4  1          0.8               1.00       150      0.8058220  0.7206736
  0.4  2          0.6               0.50        50      0.8276173  0.7420441
  0.4  2          0.6               0.50       100      0.8510545  0.7590592
  0.4  2          0.6               0.50       150      0.8616439  0.7710801
  0.4  2          0.6               0.75        50      0.8348463  0.7565621
  0.4  2          0.6               0.75       100      0.8624865  0.7710801
  0.4  2          0.6               0.75       150      0.8668144  0.7640534
  0.4  2          0.6               1.00        50      0.8569238  0.7831591
  0.4  2          0.6               1.00       100      0.8699798  0.7566783
  0.4  2          0.6               1.00       150      0.8755374  0.7879791
  0.4  2          0.8               0.50        50      0.8382751  0.7496516
  0.4  2          0.8               0.50       100      0.8541175  0.7496516
  0.4  2          0.8               0.50       150      0.8641747  0.7496516
  0.4  2          0.8               0.75        50      0.8544160  0.7614402
  0.4  2          0.8               0.75       100      0.8674247  0.7735192
  0.4  2          0.8               0.75       150      0.8743485  0.7736934
  0.4  2          0.8               1.00        50      0.8537507  0.7446574
  0.4  2          0.8               1.00       100      0.8691485  0.7686992
  0.4  2          0.8               1.00       150      0.8722538  0.7712544
  0.4  3          0.6               0.50        50      0.8509668  0.7688153
  0.4  3          0.6               0.50       100      0.8655012  0.7759582
  0.4  3          0.6               0.50       150      0.8748691  0.7806620
  0.4  3          0.6               0.75        50      0.8720236  0.7735192
  0.4  3          0.6               0.75       100      0.8751401  0.7882695
  0.4  3          0.6               0.75       150      0.8783819  0.8000581
  0.4  3          0.6               1.00        50      0.8707276  0.7880952
  0.4  3          0.6               1.00       100      0.8847046  0.7978513
  0.4  3          0.6               1.00       150      0.8892363  0.7930314
  0.4  3          0.8               0.50        50      0.8521752  0.7349593
  0.4  3          0.8               0.50       100      0.8630937  0.7565621
  0.4  3          0.8               0.50       150      0.8679809  0.7710801
  0.4  3          0.8               0.75        50      0.8554527  0.7642276
  0.4  3          0.8               0.75       100      0.8730291  0.7713124
  0.4  3          0.8               0.75       150      0.8730515  0.7808362
  0.4  3          0.8               1.00        50      0.8838967  0.7879210
  0.4  3          0.8               1.00       100      0.8934256  0.7832172
  0.4  3          0.8               1.00       150      0.8954184  0.7855981
  Spec     
  0.6649123
  0.7192982
  0.7280702
  0.6456140
  0.6929825
  0.7228070
  0.6315789
  0.7140351
  0.7385965
  0.6385965
  0.7105263
  0.7421053
  0.6561404
  0.7105263
  0.7421053
  0.6614035
  0.7175439
  0.7298246
  0.7666667
  0.8070175
  0.8263158
  0.7578947
  0.8105263
  0.8192982
  0.7543860
  0.7859649
  0.8052632
  0.7456140
  0.7859649
  0.7894737
  0.7807018
  0.8122807
  0.8000000
  0.7666667
  0.7964912
  0.8210526
  0.8070175
  0.8157895
  0.8263158
  0.8175439
  0.8315789
  0.8245614
  0.8298246
  0.8421053
  0.8491228
  0.7877193
  0.8280702
  0.8228070
  0.8245614
  0.8263158
  0.8438596
  0.8035088
  0.8263158
  0.8350877
  0.7140351
  0.7403509
  0.7526316
  0.6684211
  0.7333333
  0.7491228
  0.6824561
  0.7192982
  0.7421053
  0.6964912
  0.7175439
  0.7280702
  0.6701754
  0.7438596
  0.7456140
  0.6578947
  0.7192982
  0.7438596
  0.7859649
  0.7929825
  0.7912281
  0.7701754
  0.8035088
  0.8087719
  0.7947368
  0.8263158
  0.8298246
  0.7578947
  0.8228070
  0.8315789
  0.7929825
  0.8368421
  0.8385965
  0.8035088
  0.8192982
  0.8140351
  0.7964912
  0.8122807
  0.8263158
  0.8280702
  0.8350877
  0.8368421
  0.8280702
  0.8403509
  0.8456140
  0.8070175
  0.8280702
  0.8333333
  0.8000000
  0.8192982
  0.8140351
  0.8385965
  0.8403509
  0.8421053

Tuning parameter 'gamma' was held constant at a value of 0
Tuning
 parameter 'min_child_weight' was held constant at a value of 1
ROC was used to select the optimal model using  the largest value.
The final values used for the model were nrounds = 150, max_depth = 3, eta
 = 0.4, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1 and subsample
 = 1.

In [242]:
c5grid <- expand.grid( .winnow = c(TRUE,FALSE), .trials=c(1,5,10,15,20,25), .model=c("rules","tree"))

set.seed(13)
fit.c5 <- train(formula, data = sktrain, method = "C5.0", preProcess = c('zv','medianImpute','BoxCox'), metric = metric, 
                trControl = control, na.action = na.pass, tuneGrid=c5grid)

In [243]:
fit.c5


C5.0 

985 samples
 61 predictor
  2 classes: 'class0', 'class1' 

Pre-processing: median imputation (70), Box-Cox transformation (22), remove (1) 
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 886, 887, 886, 886, 887, 886, ... 
Addtional sampling using up-sampling prior to pre-processing

Resampling results across tuning parameters:

  model  winnow  trials  ROC        Sens       Spec     
  rules  FALSE    1      0.7779046  0.7664344  0.7333333
  rules  FALSE    5      0.8414741  0.7616725  0.7842105
  rules  FALSE   10      0.8556600  0.7544715  0.8298246
  rules  FALSE   15      0.8764747  0.7542973  0.8526316
  rules  FALSE   20      0.8839905  0.7591173  0.8596491
  rules  FALSE   25      0.8848565  0.7519164  0.8614035
  rules   TRUE    1      0.7911471  0.7518002  0.8000000
  rules   TRUE    5      0.8593633  0.7638211  0.8157895
  rules   TRUE   10      0.8694455  0.7498258  0.8456140
  rules   TRUE   15      0.8777971  0.7617886  0.8473684
  rules   TRUE   20      0.8831683  0.7616725  0.8543860
  rules   TRUE   25      0.8857072  0.7544135  0.8666667
  tree   FALSE    1      0.7685346  0.6962834  0.7421053
  tree   FALSE    5      0.8194052  0.7154472  0.7824561
  tree   FALSE   10      0.8537136  0.7542393  0.8087719
  tree   FALSE   15      0.8688938  0.7470383  0.8210526
  tree   FALSE   20      0.8803717  0.7614402  0.8333333
  tree   FALSE   25      0.8816248  0.7831010  0.8228070
  tree    TRUE    1      0.7745731  0.7181185  0.7543860
  tree    TRUE    5      0.8336624  0.7300232  0.7912281
  tree    TRUE   10      0.8536280  0.7516260  0.8122807
  tree    TRUE   15      0.8677308  0.7782230  0.8245614
  tree    TRUE   20      0.8750754  0.7831591  0.8350877
  tree    TRUE   25      0.8750499  0.7902439  0.8280702

ROC was used to select the optimal model using  the largest value.
The final values used for the model were trials = 25, model = rules and
 winnow = TRUE.

In [244]:
# set up k-fold cross validation and metric
control <- trainControl(
    method = "cv"
    , number = 10
    , sampling = "up"
    , summaryFunction=twoClassSummary
    , classProbs = TRUE
    , search='random'
)
#metric <- "Accuracy"


set.seed(13)
fit.c5 <- train(formula, data = sktrain, method = "C5.0", preProcess = c('zv','medianImpute','BoxCox')
                , metric = metric
                , trControl = control
                , na.action = na.pass
                , tuneLengh=20)

In [245]:
fit.c5


C5.0 

985 samples
 61 predictor
  2 classes: 'class0', 'class1' 

Pre-processing: median imputation (70), Box-Cox transformation (22), remove (1) 
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 886, 887, 886, 886, 887, 886, ... 
Addtional sampling using up-sampling prior to pre-processing

Resampling results across tuning parameters:

  winnow  trials  ROC        Sens       Spec     
  FALSE   56      0.8916030  0.7615563  0.8649123
   TRUE   32      0.8898965  0.7567364  0.8578947
   TRUE   99      0.8949620  0.7542973  0.8736842

Tuning parameter 'model' was held constant at a value of rules
ROC was used to select the optimal model using  the largest value.
The final values used for the model were trials = 99, model = rules and
 winnow = TRUE.

In [246]:
plot(varImp(fit.c5), top = 50)



In [247]:
plot(varImp(fit.xgb), top = 50)



In [248]:
#test predicting power on unseen validation set
xgbtuneprediction <- predict(fit.c5, newdata = validation, na.action = na.pass)

#Check the accuracy with a confusion matrix
confusionMatrix(xgbtuneprediction, validation$label_twopercent
                , positive = "class0" #convention is positive class is the rarest one
               )


Confusion Matrix and Statistics

          Reference
Prediction class0 class1
    class0    166     37
    class1     38    243
                                          
               Accuracy : 0.845           
                 95% CI : (0.8097, 0.8761)
    No Information Rate : 0.5785          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.682           
 Mcnemar's Test P-Value : 1               
                                          
            Sensitivity : 0.8137          
            Specificity : 0.8679          
         Pos Pred Value : 0.8177          
         Neg Pred Value : 0.8648          
             Prevalence : 0.4215          
         Detection Rate : 0.3430          
   Detection Prevalence : 0.4194          
      Balanced Accuracy : 0.8408          
                                          
       'Positive' Class : class0          
                                          

In [249]:
#predict on latest dates
exend$prediction <- predict(fit.c5, newdata = exend, na.action = na.pass)
predictionprob <- predict(fit.c5, newdata = exend, na.action = na.pass, type = "prob")

exend <- cbind(exend, predictionprob)

In [250]:
glimpse(exend)


Observations: 15
Variables: 76
$ date                <date> 2017-08-23, 2017-08-24, 2017-08-25, 2017-08-28...
$ open                <dbl> 142.48, 143.01, 142.82, 142.40, 141.19, 143.09,...
$ high                <dbl> 143.01, 143.17, 143.18, 142.74, 143.21, 144.89,...
$ low                 <dbl> 142.38, 141.47, 141.78, 141.95, 141.06, 143.00,...
$ close               <dbl> 142.69, 142.27, 141.97, 142.41, 142.97, 144.65,...
$ volume              <dbl> 25702300, 38160800, 26380400, 15362100, 2449830...
$ adjusted            <dbl> 142.69, 142.27, 141.97, 142.41, 142.97, 144.65,...
$ runMaxHigh          <dbl> 145.47, 145.47, 145.47, 145.47, 145.47, 144.96,...
$ runMinLow           <dbl> 140.18, 140.18, 140.18, 140.18, 140.18, 140.18,...
$ label_twopercent    <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ dayofweek           <dbl> 4, 5, 6, 2, 3, 4, 5, 6, 3, 4, 5, 6, 2, 3, 4
$ dayofmonth          <int> 23, 24, 25, 28, 29, 30, 31, 1, 5, 6, 7, 8, 11, ...
$ dayofyear           <dbl> 235, 236, 237, 240, 241, 242, 243, 244, 248, 24...
$ weekofyear          <dbl> 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36,...
$ monthofyear         <dbl> 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9
$ year                <dbl> 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,...
$ EMA200              <dbl> 134.0019, 134.0931, 134.1835, 134.2687, 134.357...
$ EMA100              <dbl> 139.5684, 139.6398, 139.7099, 139.7699, 139.838...
$ EMA50               <dbl> 142.3672, 142.3987, 142.4293, 142.4415, 142.471...
$ EMA20               <dbl> 143.4144, 143.3911, 143.3710, 143.3109, 143.301...
$ EMA10               <dbl> 143.3004, 143.2767, 143.2591, 143.1647, 143.172...
$ EMA5                <dbl> 143.0151, 143.0667, 143.1045, 142.9830, 143.058...
$ PoM200              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ PoM100              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ PoM50               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ PoM20               <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ PoM10               <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ PoM5                <dbl> 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1
$ MoM100200           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ MoM50200            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ MoM20200            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ MoM50               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ MoM20               <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ MoM10               <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ MoM550              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ MoM520              <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ lagEMA200           <dbl> 133.9114, 134.0019, 134.0931, 134.1835, 134.268...
$ EMA200Slope         <dbl> 0.09053374, 0.09122495, 0.09041674, 0.08513896,...
$ EMA200SlopePN       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ lagEMA200SlopePN    <dbl> 0.09486071, 0.09053374, 0.09122495, 0.09041674,...
$ EMA200SlopePNchange <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ lagEMA50            <dbl> 142.3410, 142.3672, 142.3987, 142.4293, 142.441...
$ EMA50Slope          <dbl> 0.02623626, 0.03148190, 0.03063947, 0.01218302,...
$ EMA50SlopePN        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ lagEMA50SlopePN     <dbl> 0.04118468, 0.02623626, 0.03148190, 0.03063947,...
$ EMA50SlopePNchange  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ lagEMA20            <dbl> 143.4570, 143.4144, 143.3911, 143.3710, 143.310...
$ EMA20Slope          <dbl> -0.042570498, -0.023278069, -0.020108730, -0.06...
$ EMA20SlopePN        <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ lagEMA20SlopePN     <dbl> -0.011262129, -0.042570498, -0.023278069, -0.02...
$ EMA20SlopePNchange  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ lagEMA10            <dbl> 143.3649, 143.3004, 143.2767, 143.2591, 143.164...
$ EMA10Slope          <dbl> -0.064524731, -0.023702052, -0.017574406, -0.09...
$ EMA10SlopePN        <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ lagEMA10SlopePN     <dbl> -0.003308004, -0.064524731, -0.023702052, -0.01...
$ EMA10SlopePNchange  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ runMaxHighndays     <dbl> 145.47, 145.47, 145.47, 145.47, 144.96, 144.96,...
$ newhighYN           <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0
$ newhighndays        <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ runMinLowndays      <dbl> 140.18, 140.18, 140.18, 140.18, 140.18, 140.18,...
$ newlowYN            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ newlowndays         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ roc                 <dbl> -0.0082173164, -0.0016051928, 0.0053923587, -0....
$ rsi                 <dbl> 48.84327, 49.56637, 49.61431, 47.47605, 49.9570...
$ obv                 <dbl> 364898200, 403059000, 429439400, 414077300, 438...
$ volatility          <dbl> 0.14028578, 0.13448212, 0.11502228, 0.11506869,...
$ macd                <dbl> 0.053737291, 0.038514303, 0.026704024, -0.00735...
$ signal              <dbl> 0.2605737, 0.2161618, 0.1782703, 0.1411453, 0.1...
$ macddir             <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ aroonUp             <dbl> 5, 0, 35, 30, 25, 20, 100, 100, 95, 90, 85, 80,...
$ aroonDn             <dbl> 90, 85, 80, 75, 70, 65, 60, 55, 50, 45, 40, 35,...
$ oscillator          <dbl> -85, -85, -45, -45, -45, -45, 40, 45, 45, 45, 4...
$ aroondir            <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ prediction          <fctr> class1, class1, class1, class1, class1, class1...
$ class0              <dbl> 0.1983928, 0.1832918, 0.1936885, 0.1629757, 0.2...
$ class1              <dbl> 0.8016072, 0.8167082, 0.8063115, 0.8370243, 0.7...

In [251]:
# buy at open. set a good-til-cancel sell limit order for 2% above buy price. ex: buy_price * 1.02
dplyr::select(exend, date, prediction, 'class0', 'class1')


datepredictionclass0class1
2017-08-23class1 0.1983928 0.8016072
2017-08-24class1 0.1832918 0.8167082
2017-08-25class1 0.1936885 0.8063115
2017-08-28class1 0.1629757 0.8370243
2017-08-29class1 0.2771244 0.7228756
2017-08-30class1 0.2253515 0.7746485
2017-08-31class0 0.5919528 0.4080472
2017-09-01class0 0.6780399 0.3219601
2017-09-05class0 0.5498736 0.4501264
2017-09-06class1 0.4638385 0.5361615
2017-09-07class0 0.5007339 0.4992661
2017-09-08class1 0.3970528 0.6029472
2017-09-11class1 0.4418608 0.5581392
2017-09-12class1 0.4409946 0.5590054
2017-09-13class1 0.4056559 0.5943441

In [252]:
# scatter plot
# http://ggplot2.tidyverse.org/reference/geom_point.html
#p <- ggplot(validation, aes(percentchangeMax15, runMaxHigh, shape = factor(label_twopercent)))
#p + geom_point(aes(colour = factor(label_twopercent)), size = 4) +
#  geom_point(colour = "grey90", size = 1.5)

In [253]:
#test predicting power on unseen validation set
xgbtunepredictionprob <- predict(fit.xgb, newdata = validation, na.action = na.pass, type = "prob")

In [254]:
head(xgbtunepredictionprob)
head(xgbtunepredictionprob[,"class1"])


class0class1
0.12806720.8719328
0.10294610.8970539
0.10744400.8925560
0.46986680.5301332
0.19363370.8063663
0.83979930.1602007
  1. 0.871932804584503
  2. 0.897053852677345
  3. 0.892555966973305
  4. 0.530133247375488
  5. 0.806366339325905
  6. 0.160200655460358

In [255]:
rocCurve <- pROC::roc(response = validation$label_twopercent, predictor = xgbtunepredictionprob[,"class1"]
                      #, levels = rev(levels(validation$label_twopercent))
                     )
pROC::auc(rocCurve)
plot(rocCurve, legacy.axes = TRUE)


0.912920168067227

In [256]:
# i want to decrease my type II (false negatives) errors at the expense of my type I (false positive) errors
# meaning: when I get a buy signal of 1, I want to minimize the chance that it's incorrect. I don't care so much about 
# the incorrectness of when I'm not getting into a trade.

In [257]:
# scatter plot
# http://ggplot2.tidyverse.org/reference/geom_point.html
p <- ggplot(validation, aes(percentchangeMax15, runMaxHigh, shape = factor(prediction)))
p + geom_point(aes(colour = factor(prediction)), size = 4) +
  geom_point(colour = "grey90", size = 1.5)



In [ ]:


In [258]:
d <- cbind(validation, pred = xgbtuneprediction)
d


dateopenhighlowclosevolumeadjustedrunMaxHighrunMinLowlabel_twopercent...volatilitymacdsignalmacddiraroonUparoonDnoscillatoraroondirpredictionpred
2011-10-20 60.922 60.987 59.776 56.59 80378300 52.81651 62.455 53.679 class1 ... 0.20222955 1.2944856 0.824077631 90 40 50 1 class1 class1
2011-10-24 61.619 62.840 61.405 58.49 61449500 54.58982 62.840 53.679 class0 ... 0.19031778 1.4544155 1.028738881 100 30 70 1 class1 class1
2011-10-26 61.780 61.908 60.226 57.27 98193800 53.45116 62.840 55.351 class0 ... 0.19732407 1.4424326 1.186837691 90 20 70 1 class1 class1
2011-10-27 62.830 63.430 62.176 58.85 104455500 54.92582 63.430 57.012 class0 ... 0.21831081 1.5778117 1.265032501 100 15 85 1 class1 class0
2011-11-01 60.558 62.047 60.130 56.44 99933000 52.67652 63.430 59.679 class1 ... 0.23044586 1.4779623 1.411430961 85 0 85 1 class1 class1
2011-11-04 61.951 62.197 61.308 57.80 61527800 53.94582 63.430 59.776 class1 ... 0.21264268 1.1260423 1.299645820 70 0 70 1 class0 class0
2011-11-07 61.887 62.412 61.255 58.21 65559000 54.32848 63.430 59.776 class0 ... 0.21201263 1.0840847 1.256533600 65 0 65 1 class0 class0
2011-11-21 58.554 58.640 57.622 54.34 52283700 50.71654 63.194 57.622 class1 ... 0.19798775 -0.2357549 0.496686380 15 100 -85 0 class1 class1
2011-11-23 57.976 58.040 57.065 53.29 48437300 49.73656 63.194 57.065 class1 ... 0.18915460 -0.8913937 0.049129040 5 100 -95 0 class1 class1
2011-11-25 56.851 57.504 56.647 52.88 25448100 49.35390 63.194 56.647 class1 ... 0.15933217 -1.2101357 -0.202723910 0 100 -100 0 class1 class1
2011-12-09 60.280 61.276 60.258 57.02 55757900 53.21784 61.737 56.647 class0 ... 0.13712130 0.0829594 -0.258868091 15 50 -35 0 class0 class0
2011-12-12 60.451 61.083 59.894 56.38 55411700 52.62052 61.737 56.647 class0 ... 0.14329760 0.1033070 -0.186433081 10 45 -35 0 class1 class0
2011-12-19 58.831 58.970 57.869 54.32 46733900 50.84743 61.737 57.869 class1 ... 0.12684994 -0.5281527 -0.256245380 50 20 30 1 class1 class1
2011-12-21 59.397 59.440 58.211 55.13 78552300 51.60565 61.737 57.869 class1 ... 0.16472813 -0.5770153 -0.364248180 40 10 30 1 class1 class1
2011-12-29 59.431 59.804 59.239 55.99 28077900 52.45650 61.437 57.869 class1 ... 0.11600124 -0.4202588 -0.441048941 15 65 -50 0 class1 class1
2012-01-03 60.743 61.042 60.573 56.90 39514100 53.30908 61.276 57.869 class1 ... 0.13864116 -0.2185220 -0.389136351 5 55 -50 0 class1 class1
2012-01-09 61.864 61.886 61.299 57.62 39195500 53.98363 61.886 57.869 class1 ... 0.11065166 0.4170717 -0.026090481 100 35 65 1 class1 class1
2012-01-12 62.216 62.409 61.757 58.39 26188300 54.70504 62.409 58.211 class1 ... 0.09963956 0.7834547 0.327063171 100 20 80 1 class1 class1
2012-01-13 61.992 62.120 61.565 58.18 35980400 54.50829 62.409 58.211 class1 ... 0.11102668 0.8140852 0.424467571 95 15 80 1 class1 class1
2012-01-18 62.782 63.497 62.633 59.49 48692700 55.73563 63.497 59.174 class1 ... 0.09011154 1.0887558 0.638793901 100 5 95 1 class1 class1
2012-01-23 63.807 64.266 63.444 59.79 40958300 56.01669 64.266 59.239 class1 ... 0.09819634 1.4752341 1.001281041 100 0 100 1 class1 class1
2012-01-25 64.319 64.661 63.860 60.43 61591800 56.61630 64.661 60.370 class1 ... 0.11729674 1.5975564 1.199965791 100 10 90 1 class1 class1
2012-01-30 63.924 64.618 63.679 60.45 42797800 56.63504 64.896 61.266 class1 ... 0.10624210 1.6881151 1.439195181 90 0 90 1 class1 class1
2012-02-07 66.123 66.443 65.824 62.13 37935700 58.20902 66.443 62.430 class1 ... 0.07573786 1.9676830 1.757968861 100 0 100 1 class1 class1
2012-02-08 66.326 66.667 66.080 62.46 41171100 58.51819 66.667 62.633 class1 ... 0.07568267 2.0033845 1.807052001 100 0 100 1 class1 class1
2012-02-10 66.667 66.859 66.443 62.47 58090500 58.52756 67.222 63.412 class1 ... 0.08112835 2.0621742 1.900954861 95 5 90 1 class1 class1
2012-02-22 67.767 67.970 67.489 63.32 43174700 59.32391 68.194 64.191 class1 ... 0.08971698 2.0611228 2.072611930 95 0 95 1 class1 class1
2012-02-27 67.895 68.589 67.596 64.05 42678500 60.00784 68.589 65.813 class1 ... 0.06373729 1.9373445 2.019309940 100 5 95 1 class1 class1
2012-02-28 68.428 69.058 68.343 64.70 43717500 60.61682 69.058 65.813 class1 ... 0.06863468 1.9481082 2.005069590 100 0 100 1 class1 class1
2012-03-05 69.197 69.282 68.300 64.20 45788800 60.14837 69.464 66.443 class1 ... 0.05237641 1.8873478 1.961873660 95 5 90 1 class1 class1
.............................. ..............................
2017-03-02 132.042 132.062 131.248 130.78 19951400 130.1540 132.313 126.485 class0 ... 0.06153153 1.4189154 1.401896091 95 5 90 1 class0 class0
2017-03-08 131.218 131.781 131.088 130.74 15776000 130.1142 132.313 128.354 class0 ... 0.06070134 1.2062941 1.325282240 75 5 70 1 class0 class0
2017-03-09 131.339 131.650 130.766 130.84 20855300 130.2137 132.313 128.987 class0 ... 0.05716685 1.1435400 1.288933800 70 0 70 1 class0 class0
2017-03-23 130.876 131.237 130.495 130.36 20311700 130.0058 133.047 130.033 class1 ... 0.08785661 0.6608826 0.915978240 70 5 65 1 class1 class1
2017-04-07 132.360 132.731 131.858 131.97 16743100 131.6114 133.844 129.753 class1 ... 0.07565833 0.5079876 0.577473670 90 55 35 1 class1 class1
2017-04-10 132.460 132.931 132.019 132.02 15619300 131.6613 133.844 129.753 class1 ... 0.06601224 0.4825250 0.558483930 85 50 35 1 class1 class1
2017-04-11 132.259 132.460 130.856 131.45 33778400 131.0929 133.844 129.753 class1 ... 0.06843984 0.4287522 0.532537590 80 45 35 1 class1 class1
2017-04-12 131.788 131.898 131.136 130.92 15955200 130.5643 133.844 129.753 class1 ... 0.06974607 0.3479304 0.495616150 75 40 35 1 class1 class1
2017-04-17 131.126 131.858 131.076 131.48 13325500 131.1228 133.844 129.753 class1 ... 0.06861393 0.2208946 0.405985450 65 30 35 1 class1 class1
2017-05-02 137.744 137.865 137.393 137.43 18345100 137.0566 137.865 130.735 class0 ... 0.06556021 1.0380507 0.696175681 100 40 60 1 class0 class1
2017-05-03 137.494 137.544 136.992 136.99 23827600 136.6178 137.865 130.735 class1 ... 0.07452137 1.0838608 0.773712701 95 35 60 1 class1 class1
2017-05-16 139.780 140.020 139.369 139.62 21786000 139.2407 140.020 134.846 class1 ... 0.03592924 1.2526168 1.159794081 100 0 100 1 class1 class1
2017-05-19 138.125 138.797 138.035 137.84 36730100 137.4655 140.020 136.140 class1 ... 0.07787631 1.0371777 1.131027300 85 0 85 1 class1 class1
2017-05-30 141.535 141.896 141.444 141.34 20306400 140.9560 141.896 136.240 class0 ... 0.09087732 1.1771986 1.108261851 100 65 35 1 class0 class0
2017-06-07 143.650 143.931 142.988 143.42 18842400 143.0303 144.122 136.240 class0 ... 0.07466852 1.4138624 1.289609061 95 35 60 1 class0 class0
2017-06-09 144.132 144.292 138.486 139.98 109783700 139.5997 144.292 136.240 class1 ... 0.05763833 1.4011624 1.331637631 100 25 75 1 class0 class0
2017-06-12 139.178 139.860 137.845 139.23 104454000 138.8517 144.292 137.845 class0 ... 0.18796721 1.1278509 1.290880280 95 20 75 1 class1 class0
2017-06-14 141.314 141.314 138.998 139.75 60093800 139.3703 144.292 137.845 class1 ... 0.19466103 0.8411710 1.148246730 85 10 75 1 class0 class0
2017-06-15 138.717 139.659 137.865 139.13 55145200 138.7520 144.292 137.845 class1 ... 0.18798067 0.6441403 1.047425450 80 5 75 1 class0 class1
2017-06-23 140.540 141.420 140.150 141.24 21654300 141.2400 144.292 137.640 class0 ... 0.11613826 0.2613293 0.491350910 50 75 -25 0 class0 class0
2017-06-26 142.050 142.290 140.270 140.58 34411100 140.5800 144.292 137.640 class1 ... 0.11355353 0.3095085 0.454982430 45 70 -25 0 class0 class0
2017-07-03 138.270 138.430 136.100 136.19 32797000 136.1900 142.290 136.100 class1 ... 0.09911522 -0.1723345 0.146031190 20 100 -80 0 class1 class1
2017-07-05 136.620 137.900 136.160 137.53 42116600 137.5300 142.290 136.100 class1 ... 0.09677285 -0.2854596 0.059733030 15 95 -80 0 class1 class1
2017-07-18 142.090 143.170 141.640 143.14 25748500 143.1400 143.170 135.800 class0 ... 0.07973609 0.3025613 -0.021498901 100 60 40 1 class0 class1
2017-07-20 144.350 144.440 143.520 144.17 36139400 144.1700 144.440 135.800 class0 ... 0.05280637 0.6006741 0.180797121 100 50 50 1 class0 class0
2017-07-27 145.820 145.960 142.300 143.96 80665900 143.9600 145.960 135.800 class0 ... 0.05241483 0.9703152 0.637897191 100 25 75 1 class0 class0
2017-07-28 143.100 144.080 142.870 143.84 38013200 143.8400 145.960 136.740 class0 ... 0.09977670 0.9200750 0.694332761 95 20 75 1 class0 class0
2017-08-09 143.120 144.190 142.790 144.12 36169500 144.1200 145.960 142.300 class1 ... 0.10647857 0.6439247 0.708308170 55 0 55 1 class0 class0
2017-08-15 144.290 144.300 143.640 144.03 25509200 144.0300 145.960 140.890 class0 ... 0.10943652 0.3887811 0.540170520 35 85 -50 0 class1 class1
2017-08-18 141.410 142.200 140.650 141.23 60751900 141.2300 145.470 140.650 class1 ... 0.13158114 0.2408532 0.437600780 20 100 -80 0 class1 class1

In [259]:
options(tibble.width = Inf)
tb <- dplyr::select(d,
       everything()
       ) %>%
    dplyr::filter(., label_twopercent == '0' & pred == '1')

tb


dateopenhighlowclosevolumeadjustedrunMaxHighrunMinLowlabel_twopercent...volatilitymacdsignalmacddiraroonUparoonDnoscillatoraroondirpredictionpred

In [ ]:


In [ ]: